#remove warnings
import warnings
warnings.filterwarnings("ignore")
#Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pydotplus
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from copy import deepcopy
from sklearn.svm import LinearSVR
from sklearn import preprocessing
from scipy.spatial.distance import euclidean
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, export_graphviz
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.cluster import k_means
from io import StringIO
from IPython.display import Image
#Global variables
env_params = {
"InsConColumns" : ['Premiums in LOB: Motor',
'Premiums in LOB: Household',
'Premiums in LOB: Health',
'Premiums in LOB: Life',
'Premiums in LOB: Work Compensations',
'Premium: Sum'],
"ValEngColumns" : ['Educational Degree',
'Geographic Living Area',
'Has Children (Y=1)',
'Gross Monthly Salary',
'Customer Monetary Value',
'First Policy´s Age'],
"CategoricalColumns" : ['Educational Degree',
'Geographic Living Area',
'Has Children (Y=1)'],
"NumericalColumns" : ['First Policy´s Year',
'Gross Monthly Salary',
'Customer Monetary Value',
'Claims Rate',
'Premiums in LOB: Motor',
'Premiums in LOB: Household',
'Premiums in LOB: Health',
'Premiums in LOB: Life',
'Premiums in LOB: Work Compensations',
'Age',
'First Policy´s Age',
'Premium: Sum']
}
#Function to split the DataFrame into a complete part (rows without any NaN)
#and an incomplete part (rows with at least one NaN)
def split(data_insurance, reset_index = False):
    mask = data_insurance.isna().any(axis=1)
    data_insurance_incomplete = data_insurance[mask]
    if reset_index:
        data_insurance_incomplete = data_insurance_incomplete.reset_index(drop=True)
    data_insurance_complete = data_insurance[~mask]
    return data_insurance_complete, data_insurance_incomplete
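#Minimal sanity sketch of split() on a toy frame (illustrative data only):
#one complete row, two rows containing NaNs
_toy = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [4.0, 5.0, np.nan]})
_toy_complete, _toy_incomplete = split(_toy)
assert len(_toy_complete) == 1 and len(_toy_incomplete) == 2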
#Function to plot the histogram of a variable (feature)
def histogram(df, features):
fig = plt.figure(figsize=(12, 20))
for i, feature in enumerate(features):
# Set up the plot
ax = fig.add_subplot((len(features)//3)+1, 3, i + 1)
# Draw the plot
ax.hist(df[feature], bins = 50,
color = 'blue', edgecolor = 'black')
# Title and labels
ax.set_title('Histogram of ' + feature)
ax.set_xlabel(feature)
ax.set_ylabel('Customers')
plt.subplots_adjust(hspace = 0.2)
plt.show()
#Function to plot the correlation between variables
def plotCorrelation(df):
sns.set()
fig, ax = plt.subplots(figsize=(9,6))
plt.rcParams.update({'font.size': 11})
sns.heatmap(df.corr(method='pearson'), annot=True, fmt='.2f', cmap='RdBu', vmin=-1, vmax=1, linewidths=.9, ax = ax).set_title('Variables correlation')
ax.set_xticks(np.arange(df.corr(method='pearson').shape[1]+1), minor=True)
ax.set_yticks(np.arange(df.corr(method='pearson').shape[0]+1), minor=True)
ax.grid(which="minor", color="w", linestyle='-', linewidth=3)
ax.tick_params(which="minor", bottom=True, left=True)
#Function to separate the dataframe into 'Insurance Consumption' and 'Value & Engagement' features
def separateVariables(df):
InsCon = df[env_params['InsConColumns']]
ValEng = df[env_params['ValEngColumns']]
return InsCon, ValEng
#Function to perform outliers removal based on interquartile range (IQR)
def removeOutliers(df, cutoff1, cutoff2, special_features):
outliers_to_remove = []
for feature in env_params['NumericalColumns']:
if feature in df:
#calculate interquartile range
q25, q75 = np.nanpercentile(df[feature], 25), np.nanpercentile(df[feature], 75)
iqr = q75 - q25
print(feature)
print('Percentiles: 25th=%.3f, 75th=%.3f, IQR=%.3f' % (q25, q75, iqr))
#calculate the outlier cutoff
if feature not in special_features:
cut_off = iqr * cutoff1
lower, upper = q25 - cut_off, q75 + cut_off
else:
cut_off = iqr * cutoff2
lower, upper = q25 - cut_off, q75 + cut_off
#identify outliers
            outliers = [index for index, value in df[feature].items() if value < lower or value > upper]
print('Identified outliers: %d' % len(outliers))
outliers_to_remove += outliers
outliers_to_remove = set(outliers_to_remove)
outliers_to_remove = sorted(outliers_to_remove)
return outliers_to_remove
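#Illustration of the Tukey fence used above: with cutoff 1.5, a value is flagged
#when it falls outside [q25 - 1.5*IQR, q75 + 1.5*IQR] (toy numbers)
_q25, _q75 = np.nanpercentile([1, 2, 3, 4, 100], [25, 75])
assert 100 > _q75 + 1.5 * (_q75 - _q25)   #100 lies above the upper fence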
#Function to rescale the dataframe columns
def rescale(data_insurance):
data_insurance_ = deepcopy(data_insurance)
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(data_insurance_.drop(columns=env_params['CategoricalColumns'], axis = 1))
scaled_features = pd.DataFrame(scaled_features, columns = data_insurance_.drop(columns=env_params['CategoricalColumns']).columns)
scaled_data_insurance = pd.merge(scaled_features, data_insurance_[env_params['CategoricalColumns']], left_index=True, right_index=True)
return scaled_data_insurance
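#MinMaxScaler maps each column to [0, 1] via (x - min) / (max - min);
#a one-column sanity check with illustrative values
assert np.allclose(MinMaxScaler().fit_transform([[10.0], [20.0], [30.0]]).ravel(), [0.0, 0.5, 1.0])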
#Function to evaluate the best n_neighbors to use with KNN
def evaluateClassifier(data_insurance):
data_insurance_complete, data_insurance_incomplete = split(data_insurance, reset_index=True)
    #note: 'value' (the target column) comes from the enclosing loop below
    def createAndFitClassifier(k):
clf = KNeighborsClassifier(n_neighbors=k)
incomplete = deepcopy(data_insurance_incomplete)
complete = deepcopy(data_insurance_complete)
X_train, X_test, y_train, y_test = train_test_split(complete.loc[:,complete.columns != value].values,
complete.loc[:,value].values, test_size = 0.2, random_state = 0)
trained_model = clf.fit(X_train, y_train)
result = [clf, y_test, X_test, trained_model, incomplete, complete]
return result
accuracies_for_value_dict = {}
for index, value in enumerate(env_params['CategoricalColumns']):
accuracy_dict = {}
for k in range(3,100):
result = createAndFitClassifier(k)
clf = result[0]
y_test = result[1]
X_test = result[2]
            #calculate the model accuracy and store it in a dictionary
y_pred = clf.predict(X_test)
accuracy_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_matrix.trace()/accuracy_matrix.sum()
accuracy_dict[k] = accuracy
accuracies_for_value_dict[value] = accuracy_dict
return accuracies_for_value_dict
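#Sanity check on toy labels (a minimal sketch): trace/sum of the confusion
#matrix is exactly the accuracy that sklearn computes directly
_cm = confusion_matrix([0, 1, 1], [0, 1, 0])
assert np.isclose(_cm.trace() / _cm.sum(), metrics.accuracy_score([0, 1, 1], [0, 1, 0]))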
#Function that uses KNN to classify the missing values on CATEGORICAL columns
def classifyCategoricalData(data_insurance, n_neighbors_dict):
data_insurance_complete, data_insurance_incomplete = split(data_insurance, reset_index=True)
#Creating a classifier to fill the categorical data: Educational Degree, Geographic Living Area and Has Children (Y=1)
for index, value in enumerate(env_params['CategoricalColumns']):
if value in n_neighbors_dict:
clf = KNeighborsClassifier(n_neighbors=n_neighbors_dict[value])
incomplete = deepcopy(data_insurance_incomplete)
complete = deepcopy(data_insurance_complete)
X_train, X_test, y_train, y_test = train_test_split(complete.loc[:,complete.columns != value].values,
complete.loc[:,value].values, test_size = 0.2, random_state = 0)
trained_model = clf.fit(X_train,
y_train)
#fill the numerical columns with the column mean
incomplete.loc[:, ~incomplete.columns.isin(env_params['CategoricalColumns']) ] = incomplete.loc[:,
~incomplete.columns.isin(env_params['CategoricalColumns'])].apply(lambda column: column.fillna(column.mean()), axis=0)
#Round Age and First Policy's Year
#incomplete['Age'] = incomplete['Age'].apply(lambda x:round(x))
incomplete['First Policy´s Age'] = incomplete['First Policy´s Age'].apply(lambda x:round(x))
#Categorical columns with the exception of the one we want to predict
cat_without_the_column = deepcopy(env_params['CategoricalColumns'])
cat_without_the_column.pop(index)
#Fill the categorical columns with the exception of the one we want to predict with the mode
            incomplete.loc[:, incomplete.columns.isin(cat_without_the_column) ] = incomplete.loc[:,
                incomplete.columns.isin(cat_without_the_column)].apply(lambda column: column.fillna(int(column.mode()[0])), axis=0)
prediction = trained_model.predict(incomplete.loc[:,incomplete.columns != value])
temp_df = pd.DataFrame(prediction.reshape(-1,1), columns = [value])
#now we are filling data_insurance_incomplete
for ind in range(len(temp_df)):
                if np.isnan(data_insurance_incomplete.loc[ind, value]):
                    data_insurance_incomplete.loc[ind, value] = temp_df.loc[ind, value]
#and reconstructing the original dataframe
dataset = pd.concat([data_insurance_complete, data_insurance_incomplete])
dataset.set_index(dataset['Customer Identity'] - 1, inplace=True)
return dataset
#Function to check which algorithm is best for each NUMERICAL column
def checking_choices(data_insurance, number_of_tests=10):
data_insurance_complete, data_insurance_incomplete = split(data_insurance)
choices = []
better_for_each_column = []
test_errors = {}
#testing
for i in range(number_of_tests):
test, error_list = regressor_test(data_insurance)
choices.append(test)
test_errors.update({'Test: '+ str(i):error_list})
    #choosing the best algorithm for each column
for i in range(len(data_insurance.columns)):
l = []
for j in range(len(choices)):
l.append(choices[j][i])
better_for_each_column.append(max(set(l), key = l.count))
    #return the best algorithm for each column, and the error list
return better_for_each_column, test_errors
#Function to test which regressor is best for each numerical column
#Returns the winning algorithm per column plus the R² scores (the number of tests is set in checking_choices)
def regressor_test(data_insurance):
    #variables to hold the R² score (regressor.score) of each model
    dt_errors = []
    linear_errors = []
    svr_errors = []
complete,incomplete = split(data_insurance)
    for i in complete.columns:
X_train, X_test, y_train, y_test = train_test_split(complete.loc[:,complete.columns != i].values,
complete.loc[:,i].values, test_size = 0.2, random_state = 0)
regressor1 = DecisionTreeRegressor(min_samples_split=112,min_samples_leaf=9)
regressor2= LinearRegression()
regressor3=LinearSVR()
DT_Model = regressor1.fit(X_train,
y_train)
Linear_trained_model2 = regressor2.fit(X_train,
y_train)
SVR_trained_model3 = regressor3.fit(X_train,
y_train)
        incomplete_2 = deepcopy(incomplete)
        #fill remaining gaps with each column's mean, matching the imputation used elsewhere
        incomplete_2.loc[:, incomplete.columns != i] = incomplete_2.loc[:,
            incomplete.columns != i].apply(lambda column: column.fillna(column.mean()), axis=0)
y_pred1 = regressor1.predict(X_test)
y_pred2 = regressor2.predict(X_test)
y_pred3 = regressor3.predict(X_test)
dt_errors.append(regressor1.score(X_test, y_test))
linear_errors.append(regressor2.score(X_test, y_test))
svr_errors.append(regressor3.score(X_test, y_test))
errors_dict = {}
errors_dict.update({"KN-Errors": dt_errors})
errors_dict.update( {"Linear-Errors": linear_errors})
errors_dict.update( {"SVR-Errors": svr_errors})
    #R² SCORE (higher is better)
    R2 = []
    #Pick the winning model for each column by its R²
for i in range(0, len(complete.columns)):
l = []
l.extend((dt_errors[i], linear_errors[i], svr_errors[i]))
        if max(l) == dt_errors[i]:
            R2.append("DT")
        elif max(l) == linear_errors[i]:
            R2.append("Linear")
        elif max(l) == svr_errors[i]:
            R2.append("SVR")
return R2, errors_dict
#function to apply the regressors
def apply_regressors(choices, data_insurance, numerical_columns):
complete,incomplete = split(data_insurance)
for i,v in enumerate(complete.columns):
#Check if it is a numerical column
if v in numerical_columns:
            #use the chosen algorithm ('DT' mirrors the decision tree evaluated in regressor_test)
            if choices[i] == 'DT':
                regressor = DecisionTreeRegressor(min_samples_split=112,
                                                  min_samples_leaf=9)
elif choices[i] == 'SVR':
regressor = LinearSVR()
elif choices[i] == 'Linear':
regressor = LinearRegression()
#Split in train-test data
X_train, X_test, y_train, y_test = train_test_split(complete.loc[:,complete.columns != v].values,
complete.loc[:,v].values, test_size = 0.2, random_state = 0)
#Train the model
trained_model = regressor.fit(X_train,
y_train)
#Make predictions
incomplete_2 = deepcopy(incomplete)
            incomplete_2.loc[:, incomplete.columns != v] = incomplete_2.loc[:,
                incomplete.columns != v].apply(lambda column: column.fillna(column.mean()), axis=0)
prediction = trained_model.predict(incomplete_2.loc[:,incomplete_2.columns != v])
temp_df = pd.DataFrame(prediction.reshape(-1,1), columns = [v])
            #fill NaN's on the incomplete dataframe
for index in range(len(temp_df)):
if np.isnan(incomplete.iloc[index,i]):
incomplete.iloc[index,i] = temp_df[v][index]
    #and reconstructing the original dataframe
dataset = pd.concat([complete, incomplete])
dataset.set_index(dataset['Customer Identity'] - 1, inplace=True)
return dataset
#Given two features (columns), this function cross-tabulates the dataframe by quartiles (number of customers in each quartile cell)
def quartileMatrix(df, feature1, feature2):
quartile_id = ['q1','q2','q3','q4']
    df['f1_quartile'] = pd.qcut(df[feature1], 4, labels=quartile_id)
    df['f2_quartile'] = pd.qcut(df[feature2], 4, labels=quartile_id)
    quartiles_df = pd.DataFrame(index=quartile_id, columns=quartile_id)
    for i in quartile_id:
        for j in quartile_id:
            quartiles_df.loc[i, j] = len(df[(df["f1_quartile"]==i) & (df["f2_quartile"]==j)])
return quartiles_df
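#pd.qcut builds equal-frequency bins, so (toy sketch) eight distinct values
#land two per quartile
_s = pd.Series(range(8))
assert (pd.qcut(_s, 4, labels=['q1','q2','q3','q4']).value_counts() == 2).all()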
#_________________________Cleaning and Filling the Data with the algorithms___________________________________________
#Read the dataset
insurance_df = pd.read_csv('https://drive.google.com/uc?export=download&id=1dCJHxjPPob8sEfyY6ddja0DeYpos5Uq7')
#Create Age column (the source data spells the column 'Brithday Year')
insurance_df['Age'] = insurance_df.loc[:, 'Brithday Year'].apply(lambda x : 2018 - x )
#Create First Policy´s Age column
insurance_df['First Policy´s Age'] = insurance_df.loc[:, 'First Policy´s Year'].apply(lambda x : 2018 - x )
#Drop Birthday Year and First Policy´s Year
insurance_df.drop(['Brithday Year', 'First Policy´s Year' ], axis=1, inplace=True)
#Adding the sum of all premiums paid as a column
insurance_df['Premium: Sum']=insurance_df[['Premiums in LOB: Work Compensations',
'Premiums in LOB: Life',
'Premiums in LOB: Health',
'Premiums in LOB: Household',
'Premiums in LOB: Motor']].sum(axis=1)
#Count the number of rows with NaNs
rows_with_nans = insurance_df.shape[0] - insurance_df.dropna().shape[0]
print(rows_with_nans)
print(rows_with_nans/insurance_df.shape[0])
#First outlier pass: the standard 1.5*IQR fence on every numerical column (exploratory; superseded below)
outliers = removeOutliers(insurance_df, 1.5, 1.5, [])
features = ['Gross Monthly Salary',
'Customer Monetary Value',
'Claims Rate',
'Premiums in LOB: Motor',
'Premiums in LOB: Household',
'Premiums in LOB: Health',
'Premiums in LOB: Life',
'Premiums in LOB: Work Compensations',
'Age',
'First Policy´s Age',
'Premium: Sum']
histogram(insurance_df, features)
#Second pass: relax the fence to 3*IQR for the heavy-tailed features listed above
outliers = removeOutliers(insurance_df, 1.5, 3, features)
print(len(outliers))
print(insurance_df.shape[0])
print(len(outliers)/insurance_df.shape[0])
outliers_data = []
for i in outliers:
outliers_data.append(insurance_df.iloc[i,:])
insurance_df.drop(outliers, inplace=True)
insurance_df.shape
histogram(insurance_df, features)
#Drop rows with more than 3 NaN's
insurance_df.dropna(thresh=(len(insurance_df.columns) - 3), inplace=True, axis=0)
insurance_df.shape
#Counting the rows with 'First Policy´s Age' > 'Age'
#It shows us a possible problem with the data
insurance_df[insurance_df['First Policy´s Age'] > insurance_df['Age']].shape[0]
#Proportion of customers below 18 that have children
#Is 'Age' trustworthy?
x = insurance_df[(insurance_df['Age'] <= 18) & (insurance_df['Has Children (Y=1)'] == 1)].shape[0]
y = insurance_df[(insurance_df['Age'] <= 18) & (insurance_df['Has Children (Y=1)'] == 0)].shape[0]
print("{:.2f}".format(x/(x+y)*100),'%')
data_insurance = deepcopy(insurance_df)
#Encode Educational Degree and restore the NaN's: str(np.nan) == 'nan' sorts after the four degree labels, so LabelEncoder gives it the last code (4)
data_insurance['Educational Degree'] = data_insurance['Educational Degree'].apply(str)
labelencoder_X = LabelEncoder()
data_insurance.loc[:,'Educational Degree'] = labelencoder_X.fit_transform(data_insurance.loc[:,'Educational Degree'])
data_insurance['Educational Degree'] = data_insurance['Educational Degree'].apply(lambda x : np.nan if x == 4 else x )
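#Why the trick above works (minimal check): 'nan' sorts after any label that
#starts with a digit, so it receives the highest LabelEncoder code
assert sorted(['1 - Basic', 'nan'])[-1] == 'nan'   #'1 - Basic' is an illustrative degree label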
data_insurance.tail()
#Verify the optimal n_neighbors to our KNN classifiers
scaled_data_insurance = rescale(data_insurance)
scaled_data_insurance = scaled_data_insurance.drop('Customer Identity', axis=1)
accuracies_for_column_dict = evaluateClassifier(scaled_data_insurance)
fig, ax = plt.subplots(3, figsize=(15,10))
fig.suptitle('KNN - Accuracy x n_neighbors')
ax[0].plot(list(accuracies_for_column_dict['Educational Degree'].keys()),
list(accuracies_for_column_dict['Educational Degree'].values()),
'bx-')
ax[0].set_title('Educational Degree')
ax[0].grid(True, which='both', color='r', linestyle='-', linewidth=2)
ax[1].plot(list(accuracies_for_column_dict['Geographic Living Area'].keys()),
list(accuracies_for_column_dict['Geographic Living Area'].values()),
'bx-')
ax[1].set_title('Geographic Living Area')
ax[1].grid(True, which='both', color='r', linestyle='-', linewidth=2)
ax[2].plot(list(accuracies_for_column_dict['Has Children (Y=1)'].keys()),
list(accuracies_for_column_dict['Has Children (Y=1)'].values()),
'bx-')
ax[2].set_title('Has Children (Y=1)')
ax[2].grid(True, which='both', color='r', linestyle='-', linewidth=2)
for axis in ax.flat:
    axis.set(xlabel='n_neighbors', ylabel='Accuracy')
plt.show()
#Considering we didn't achieve good estimations for the first two categorical columns, we will drop the rows which contain NaNs
#on these columns
data_insurance.dropna(subset=['Educational Degree', 'Geographic Living Area'], inplace=True)
#Setting the optimal number of neighbors for the categorical column classification
n_neighbors_dict = {#'Educational Degree' : 8,
#'Geographic Living Area' : 11,
'Has Children (Y=1)' : 21}
#Fill categorical data with the KNN predicted Values
data_insurance = classifyCategoricalData(data_insurance, n_neighbors_dict)
#Check choices and errors
choices, errors = checking_choices(data_insurance)
errors
'''Fill numerical data with the best regressor algorithm - we decided not to use the apply_regressors function anymore:
after checking the errors of the choices, we switched to a genetic algorithm to tune better parameters for a
decision tree regressor'''
#data_insurance = apply_regressors(choices,data_insurance, env_params['NumericalColumns'])
#Full dataset
'''Summary of the genetic algorithm runs (one per numerical column with NaN's):
Low R² - discarded (drop NaN's)
Premiums in LOB: Motor - best solution was Test 5: ['friedman_mse', min_samples_split=122, min_samples_leaf=1, max_features=3, max_depth=28]
Fitness: 0.8157 - R² ~ 81.57%
Low R² - discarded (drop NaN's)
Premiums in LOB: Life - best solution was Test 4: ['friedman_mse', min_samples_split=2, min_samples_leaf=3, max_features=9, max_depth=9]
Fitness: 0.6286 - R² ~ 62.86%
Low R² - discarded (drop NaN's)
Low R² - discarded (drop NaN's)'''
'''This part of the code fills the NaN's in the Motor and Life premium columns with the best
configurations found by the genetic algorithm'''
column_for_regress = {'Premiums in LOB: Motor':['friedman_mse', 122, 1,3, 28],
'Premiums in LOB: Life': ['friedman_mse', 2, 3, 9, 9]}
for i in column_for_regress:
complete,incomplete = split(data_insurance)
#Split in train-test data
X_train, X_test, y_train, y_test = train_test_split(complete.loc[:,complete.columns != i].values,
complete.loc[:,i].values, test_size = 0.35, random_state = 1)
regressor = DecisionTreeRegressor(criterion=column_for_regress[i][0],
min_samples_split =column_for_regress[i][1],
min_samples_leaf=column_for_regress[i][2],
max_features=column_for_regress[i][3],
max_depth=column_for_regress[i][4],
random_state=1,
)
#Train the model
trained_model = regressor.fit(X_train,
y_train)
#Make predictions
incomplete_2 = deepcopy(incomplete)
incomplete_2.loc[:, incomplete.columns != i] = incomplete_2.loc[:,
incomplete.columns != i].apply(lambda row: row.fillna(row.mean()), axis=1)
prediction = trained_model.predict(incomplete_2.loc[:,incomplete_2.columns != i])
temp_df = pd.DataFrame(prediction.reshape(-1,1), columns = [i])
    #positional index of the target column (instead of hard-coded offsets from the end)
    col_ind = incomplete.columns.get_loc(i)
    #fill NaN's in the incomplete part of the insurance dataframe
    for index in range(len(temp_df)):
        if np.isnan(incomplete.iloc[index, col_ind]):
            incomplete.iloc[index, col_ind] = temp_df[i][index]
y_pred = regressor.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(r2)
    #and reconstructing the full insurance dataframe
data_insurance = pd.concat([complete, incomplete])
data_insurance.set_index(data_insurance['Customer Identity'] - 1, inplace=True)
data_insurance.dropna(inplace=True)
data_insurance.isna().sum()
data_insurance
#Preparing to plot correlation
plot_data_insurance = deepcopy(data_insurance)
#plot_data_insurance.drop(env_params['Outliers'], inplace=True)
plot_data_insurance.drop('Customer Identity', axis=1, inplace=True)
#Plotting the correlation between all variables
plotCorrelation(plot_data_insurance)
#Since 'Age' and 'Gross Monthly Salary' have a high correlation and 'Age' appears not to be trustworthy (described above),
#we'll drop the 'Age' column
plot_data_insurance.drop('Age', axis=1, inplace=True)
data_insurance.drop('Age', axis=1, inplace=True)
#Since 'Customer Monetary Value' and 'Claims Rate' have a high (inverse) correlation, we'll maintain only the CMV
plot_data_insurance.drop('Claims Rate', axis=1, inplace=True)
data_insurance.drop('Claims Rate', axis=1, inplace=True)
#Separating variables
InsCon, ValEng = separateVariables(plot_data_insurance)
#Plotting the correlation between Insurance Consumption variables
plotCorrelation(InsCon)
#print(InsCon.mean(), '\n\n', InsCon.sum())
pd.options.display.float_format = '$ {:,.2f}'.format
ins_sum = pd.DataFrame(InsCon.sum(), columns=['Sum'])
ins_mean = pd.DataFrame(InsCon.mean(), columns=['Mean'])
ins_median = pd.DataFrame(InsCon.median(), columns=['Median'])
pd.concat([ins_sum, ins_mean, ins_median], axis=1, sort=False)
pd.options.display.float_format = '{:,.2f}'.format
#Plotting the correlation between Customer Value & Engagement variables
plotCorrelation(ValEng)
def quartilePlotter(df, y_variables, x_variable):
widths = [6, 6, 6]
heights = [6, 6]
gs_kw = dict(width_ratios=widths, height_ratios=heights)
f,((ax1,ax2,ax3),(ax4,ax5,ax6)) = plt.subplots(2,3, figsize=(18,12), gridspec_kw=gs_kw)
#ax1.get_shared_y_axes().join(ax2,ax3)
#ax4.get_shared_y_axes().join(ax5)
g1 = sns.heatmap(quartileMatrix(df, y_variables[0], x_variable), annot=True, fmt="d", cmap="Greens",cbar=False,ax=ax1)
g1.set_ylabel(y_variables[0])
g1.set_xlabel(x_variable)
g1.set_ylim(4,0)
g2 = sns.heatmap(quartileMatrix(df, y_variables[1], x_variable), annot=True, fmt="d", cmap="Greens",cbar=False,ax=ax2)
g2.set_ylabel(y_variables[1])
g2.set_xlabel(x_variable)
g2.set_yticks([])
g3 = sns.heatmap(quartileMatrix(df, y_variables[2], x_variable),cmap="Greens", annot=True, fmt="d", cbar=False, ax=ax3)
g3.set_ylabel(y_variables[2])
g3.set_xlabel(x_variable)
g3.set_yticks([])
g4 = sns.heatmap(quartileMatrix(df, y_variables[3], x_variable), annot=True, fmt="d", cmap="Greens",cbar=False,ax=ax4)
g4.set_ylabel(y_variables[3])
g4.set_xlabel(x_variable)
g4.set_yticks([])
g5 = sns.heatmap(quartileMatrix(df, y_variables[4], x_variable),cmap="Greens", annot=True, fmt="d", cbar=False, ax=ax5)
g5.set_ylabel(y_variables[4])
g5.set_xlabel(x_variable)
g5.yaxis.tick_right()
g5.set_ylim(4, 0)
f.delaxes(ax6)
ax1.invert_yaxis()
ax2.invert_yaxis()
ax3.invert_yaxis()
ax4.invert_yaxis()
ax5.invert_yaxis()
#Plotting the quartile matrix for Insurance Consumption features
#Show insurance type sales potential in relation to sum of premiums
y_variables = ['Premiums in LOB: Motor',
'Premiums in LOB: Household',
'Premiums in LOB: Health',
'Premiums in LOB: Life',
'Premiums in LOB: Work Compensations']
x_variable = 'Premium: Sum'
quartilePlotter(InsCon, y_variables, x_variable)
#Plotting the quartile matrix for Insurance Consumption features versus Salary
#Show insurance type sales potential in relation to salary
y_variables = ['Premiums in LOB: Motor',
'Premiums in LOB: Household',
'Premiums in LOB: Health',
'Premiums in LOB: Life',
'Premiums in LOB: Work Compensations']
x_variable = 'Gross Monthly Salary'
quartilePlotter(plot_data_insurance, y_variables, x_variable)
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(9,6))
ax = sns.boxplot(x='Educational Degree', y='Premiums in LOB: Life', hue='Has Children (Y=1)', data=plot_data_insurance, palette="Set2")
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(9,6))
ax = sns.boxplot(x='Educational Degree', y='Premiums in LOB: Work Compensations', hue='Has Children (Y=1)', data=plot_data_insurance, palette="Set2")
fig, ax = plt.subplots(figsize=(9,6))
ax = sns.boxplot(x='Educational Degree', y='Premiums in LOB: Health', hue='Has Children (Y=1)', data=plot_data_insurance, palette="Set2")
quartiles_df = quartileMatrix(data_insurance, 'Premium: Sum', 'Customer Monetary Value')
#Decision tree implementation
#split dataset in features and target variable
feature_cols = ['Educational Degree',
'Geographic Living Area',
'Has Children (Y=1)',
'Gross Monthly Salary',
'First Policy´s Age']
#feature_cols = ['Gross Monthly Salary', 'First Policy´s Age']
X = data_insurance[feature_cols] # Independent variables (features)
y = data_insurance['f1_quartile'] # Target variable is Premium: Sum divided into quartiles
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) # 80% training and 20% test
# Create Decision Tree classifier object
clf = DecisionTreeClassifier(max_depth=3)
# Train Decision Tree classifier
clf = clf.fit(X_train,y_train)
#Predict the response for test dataset
y_pred = clf.predict(X_test)
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
#Plot the decision tree
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,
filled=True, rounded=True,
special_characters=True,feature_names = feature_cols,class_names=['q1','q2','q3','q4'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('decision_tree.png')
Image(graph.create_png())
data_insurance.drop(['Customer Identity'],axis=1,inplace=True)
data_insurance.groupby(['Educational Degree','Geographic Living Area','Has Children (Y=1)'])['Customer Monetary Value'].count().sort_values(ascending=False)
#data_insurance.groupby(['Geographic Living Area','Educational Degree','Has Children (Y=1)'])['Claims Rate'].median().sort_values(ascending=False)
#data_insurance.groupby(['Has Children (Y=1)','Educational Degree','Geographic Living Area'])['Claims Rate'].median().sort_values(ascending=False)
data_insurance.drop(['f1_quartile','f2_quartile','Premium: Sum'],axis=1, inplace=True)
#Finding the number of clusters by the elbow graph.
dat_ins=data_insurance.drop(['Educational Degree','Geographic Living Area','Has Children (Y=1)'],axis=1)
a=MinMaxScaler()
dat_ins=pd.DataFrame(a.fit_transform(dat_ins))
K=list(range(1, 11))
list_distance = []
for aux in range(len(K)):
    centroids, labels, _ = k_means(dat_ins, n_clusters=K[aux], n_init=50)
    #sum of the distances from every observation to its assigned centroid
    total_distance = sum(euclidean(centroids[labels[i], :], dat_ins.iloc[i, :])
                         for i in range(dat_ins.shape[0]))
    list_distance.append(total_distance)
    print(aux)
fig, ax = plt.subplots(figsize=(15,5))
plt.plot(K, list_distance, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Distance')
plt.title('Elbow Graph')
plt.savefig("images/elbowplot.png")
plt.show()
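#A related shortcut (sketch): sklearn's KMeans exposes inertia_, the sum of
#*squared* distances to the nearest centroid, so an elbow curve can also be
#built directly (the plot above sums unsquared euclidean distances instead)
from sklearn.cluster import KMeans
inertias = [KMeans(n_clusters=k, n_init=10, random_state=0).fit(dat_ins).inertia_ for k in K]
print(inertias)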
#Comparing with the number of clusters by the hierarchical clustering.
sns.set_style('whitegrid')
plt.figure(figsize=(10,10))
Z = linkage(dat_ins,
method='ward',
metric='euclidean',
optimal_ordering=False)
dendrogram(Z,
truncate_mode='level',
p=2,
orientation = 'top',
leaf_rotation=45,
leaf_font_size= 16,
show_contracted=False,
show_leaf_counts=True)
plt.title('Truncated Hierarchical Clustering Dendrogram')
plt.xlabel('Cluster Size')
plt.ylabel('Distance')
plt.axhline(y=30,c='k')
plt.axhline(y=23.35,c='k')
plt.axhline(y=19,c='k')
plt.savefig("images/dendro.png")
plt.show()
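#scipy's fcluster turns the same linkage matrix into flat cluster labels at a
#chosen cut height (sketch, cutting at the y=23.35 guideline drawn above)
from scipy.cluster.hierarchy import fcluster
hier_labels = fcluster(Z, t=23.35, criterion='distance')
print(len(np.unique(hier_labels)))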
from sklearn.cluster import AgglomerativeClustering
k = 4
dat_insu=data_insurance.drop(['Educational Degree','Geographic Living Area','Has Children (Y=1)'],axis=1)
a=MinMaxScaler()
dat_ins=pd.DataFrame(a.fit_transform(dat_insu))
dat_insu.columns=['Gross Monthly Salary','Customer Monetary Value',
'Premiums in LOB: Motor', 'Premiums in LOB: Household',
'Premiums in LOB: Health', 'Premiums in LOB: Life',
'Premiums in LOB: Work Compensations', 'First Policy´s Age']
Hclustering = AgglomerativeClustering(n_clusters=k,
affinity='euclidean',
linkage='ward')
#Fit the hierarchical clustering on the scaled data
my_HC = Hclustering.fit(dat_ins)
my_labels = pd.DataFrame(my_HC.labels_)
my_labels.columns = ['Labels']
da=dat_insu.reset_index()
ca=pd.DataFrame(pd.concat([da, my_labels], axis=1))
# Count the number of customers per cluster
final_result = ca.groupby(by='Labels').count()
dat_ins
dat_ins.columns[1]
ncl=4
dat_insu=data_insurance.drop(['Educational Degree','Geographic Living Area','Has Children (Y=1)'],axis=1)
a=MinMaxScaler()
dat_ins=pd.DataFrame(a.fit_transform(dat_insu))
dat_insu.columns=['Gross Monthly Salary','Customer Monetary Value',
'Premiums in LOB: Motor', 'Premiums in LOB: Household',
'Premiums in LOB: Health', 'Premiums in LOB: Life',
'Premiums in LOB: Work Compensations', 'First Policy´s Age']
centroids,labels,_=k_means(dat_ins,n_clusters=ncl,n_init=500)
ncl=4
fig = plt.figure(figsize=(10, 20))
cent=a.inverse_transform(centroids)
l=0
fig.suptitle(f'Kmeans with {dat_insu.columns[l]} as X-Axis')
for j in range(0,8):
if j == l:
continue
ax = fig.add_subplot(4, 2, j+1)
plt.xlabel(f'{dat_insu.columns[l]}')
plt.ylabel(f'{dat_insu.columns[j]}')
for i in range(ncl):
ax.scatter(dat_insu.iloc[labels==i,l],dat_insu.iloc[labels==i,j])
ax.plot(cent[:,l],cent[:,j],'sk',markersize=10)
plt.savefig("images/kmeans"+str(l)+".png")
plt.show()
from sklearn.cluster import MeanShift, estimate_bandwidth
dat_insu=data_insurance.drop(['Educational Degree','Geographic Living Area','Has Children (Y=1)'],axis=1)
a=MinMaxScaler()
dat_ins=pd.DataFrame(a.fit_transform(dat_insu))
dat_insu.columns=['Gross Monthly Salary','Customer Monetary Value',
'Premiums in LOB: Motor', 'Premiums in LOB: Household',
'Premiums in LOB: Health', 'Premiums in LOB: Life',
'Premiums in LOB: Work Compensations', 'First Policy´s Age']
# The bandwidth can be estimated automatically from the data:
my_bandwidth = estimate_bandwidth(dat_ins, quantile=0.06, n_jobs=8)
print(my_bandwidth)
ms = MeanShift(bandwidth=my_bandwidth,
#bandwidth=0.15,
bin_seeding=True,
n_jobs=8)
ms.fit(dat_ins)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
print(n_clusters_)
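#Optional quality check (sketch): the silhouette score compares intra- vs
#inter-cluster distances; values closer to 1 mean better-separated clusters
from sklearn.metrics import silhouette_score
if n_clusters_ > 1:
    print(silhouette_score(dat_ins, labels))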
ncl=3 #number of clusters to display on the scatter plots below
cent=a.inverse_transform(cluster_centers)
for i in range(8):
fig = plt.figure(figsize=(10, 20))
l=i
fig.suptitle(f'Mean Shift with {dat_insu.columns[l]} as X-Axis')
for j in range(0,8):
if j == l:
continue
ax = fig.add_subplot(4, 2, j+1)
plt.xlabel(f'{dat_insu.columns[l]}')
plt.ylabel(f'{dat_insu.columns[j]}')
for i in range(ncl):
ax.scatter(dat_insu.iloc[labels==i,l],dat_insu.iloc[labels==i,j])
ax.plot(cent[:,l],cent[:,j],'sk',markersize=10)
plt.savefig("images/mean"+str(l)+".png")
#plt.show()
my_labels = pd.DataFrame(labels)
my_labels.columns = ['Labels']
da=dat_insu.reset_index().drop(['Customer Identity'],axis=1)
ca=pd.DataFrame(pd.concat([da, my_labels], axis=1))
# Mean of each feature per mean-shift cluster (cluster profiles)
final_result = ca.groupby(by='Labels').mean()
final_result
from sklearn.mixture import GaussianMixture
dat_insu=data_insurance.drop(['Educational Degree','Geographic Living Area','Has Children (Y=1)'],axis=1)
a=MinMaxScaler()
dat_ins=pd.DataFrame(a.fit_transform(dat_insu))
dat_insu.columns=['Gross Monthly Salary','Customer Monetary Value',
'Premiums in LOB: Motor', 'Premiums in LOB: Household',
'Premiums in LOB: Health', 'Premiums in LOB: Life',
'Premiums in LOB: Work Compensations', 'First Policy´s Age']
gmm = GaussianMixture(n_components= 4,
init_params='kmeans', # {‘kmeans’, ‘random’}, defaults to ‘kmeans’.
max_iter=1000,
n_init=20)
gmm.fit(dat_ins)
labels = gmm.predict(dat_ins)
cluster_centers=gmm.means_
print(len(np.unique(labels)))
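#GMMs also support model selection via BIC (lower is better); a sketch that
#scans a few candidate component counts on the same scaled data
bics = [GaussianMixture(n_components=n, n_init=5, random_state=0).fit(dat_ins).bic(dat_ins) for n in range(2, 7)]
print(bics)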
cent=a.inverse_transform(cluster_centers)
ncl=4
for i in range(8):
fig = plt.figure(figsize=(10, 20))
    l=i
    fig.suptitle(f'Gaussian Mixture with {dat_insu.columns[l]} as X-Axis')
for j in range(0,8):
if j == l:
continue
ax = fig.add_subplot(4, 2, j+1)
plt.xlabel(f'{dat_insu.columns[l]}')
plt.ylabel(f'{dat_insu.columns[j]}')
for i in range(ncl):
ax.scatter(dat_insu.iloc[labels==i,l],dat_insu.iloc[labels==i,j])
ax.plot(cent[:,l],cent[:,j],'sk',markersize=10)
plt.savefig("images/gaus"+str(l)+".png")
#plt.show()
from sklearn.cluster import DBSCAN
dat_insu=data_insurance.drop(['Educational Degree','Geographic Living Area','Has Children (Y=1)'],axis=1)
a=MinMaxScaler()
dat_ins=pd.DataFrame(a.fit_transform(dat_insu))
dat_insu.columns=['Gross Monthly Salary','Customer Monetary Value',
'Premiums in LOB: Motor', 'Premiums in LOB: Household',
'Premiums in LOB: Health', 'Premiums in LOB: Life',
'Premiums in LOB: Work Compensations', 'First Policy´s Age']
db = DBSCAN(eps= 0.25,
min_samples=10,n_jobs=8).fit(dat_ins)
labels = db.labels_
print(len(np.unique(labels)))
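#eps is the critical DBSCAN parameter; a common heuristic (sketch) is to plot
#the sorted distance to the min_samples-th nearest neighbour and look for a knee
from sklearn.neighbors import NearestNeighbors
_dists, _ = NearestNeighbors(n_neighbors=10).fit(dat_ins).kneighbors(dat_ins)
plt.plot(np.sort(_dists[:, -1]))
plt.ylabel('Distance to 10th nearest neighbour')
plt.show()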
for i in range(8):
l=i
ncl=len(np.unique(labels))
fig = plt.figure(figsize=(10, 20))
fig.suptitle(f'DBSCAN with {dat_insu.columns[l]} as X-Axis')
for j in range(0,8):
if j == l:
continue
ax = fig.add_subplot(4, 2, j+1)
plt.xlabel(f'{dat_insu.columns[l]}')
plt.ylabel(f'{dat_insu.columns[j]}')
        for c in np.unique(labels):   #include DBSCAN's noise cluster (label -1)
            ax.scatter(dat_insu.iloc[labels==c,l],dat_insu.iloc[labels==c,j])
plt.savefig("images/dbscan"+str(l)+".png")
#plt.show()
data_insurance.columns
from kmodes.kmodes import KModes
dat_ins=data_insurance.drop(['Gross Monthly Salary','Customer Monetary Value','Premiums in LOB: Motor','Premiums in LOB: Household','Premiums in LOB: Health','Premiums in LOB: Life','Premiums in LOB: Work Compensations','First Policy´s Age'],axis=1)
dat_ins.columns=['Educational Degree', 'Geographic Living Area', 'Has Children (Y=1)']
km = KModes(n_clusters=2, init='random', n_init=128, verbose=1,n_jobs=8)
clusters = km.fit_predict(dat_ins)
ncl=len(np.unique(clusters))
fig = plt.figure(figsize=(10, 20))
l=0
fig.suptitle(f'KModes with {dat_ins.columns[l]} as X-Axis')
for j in range(3):
if j == l:
continue
ax = fig.add_subplot(2, 2, j)
plt.xlabel(f'{dat_ins.columns[l]}')
plt.ylabel(f'{dat_ins.columns[j]}')
for i in range(ncl):
ax.scatter(dat_ins.iloc[clusters==i,l],dat_ins.iloc[clusters==i,j])
plt.savefig("images/kmode.png")
plt.show()
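#The fitted "centroids" of k-modes are the per-cluster modes of each
#categorical column; printing them is often more readable than the scatter plots
print(pd.DataFrame(km.cluster_centroids_, columns=dat_ins.columns))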
outliers_data = pd.DataFrame(outliers_data)
outliers_data.isna().sum()
len(outliers_data[outliers_data.isna().any(axis=1)])
outliers_data.columns
col = ca.columns.drop('Labels')
outliers_data = outliers_data.loc[:,col]
outliers_data.dropna(inplace=True, axis=0)
outliers_data.isna().sum()
clf = KNeighborsClassifier(n_neighbors=7)
clf.fit(ca.iloc[:,:-1], ca.iloc[:,-1])
pred = clf.predict(outliers_data)
outliers_data['Labels'] = pred
outliers_data.head(15)